1 The data

# Load the height data and format it: design variables become factors, and
# age is scaled (divided by its root-mean-square) but NOT centered, because
# age is defined on R+ — we only want to reduce its scale.
# `across()` replaces the superseded `mutate_at()`; `FALSE` replaces `F`.
data <- readRDS(file = "../data/sub_portugal_5provs_4blocks.rds") %>%
  mutate(across(c(block, prov, clon, tree), as.factor)) %>% # formatting data
  mutate(age.sc = as.vector(scale(age, center = FALSE)))
  # mutate(age.sc = as.vector(scale(age))) # alternative: mean-centering age

The dataset includes:

  • Height data from a provenance trial (in Portugal) of maritime pine saplings.
  • Randomized block design. Here I selected 5 provenances and 4 blocks.
  • Saplings have different ages: 11, 15, 20 and 27 months old.
table(data$prov,data$block) %>% kable(caption = "Provenance against block number.")
Provenance against block number.
34 35 36 38
LEI 78 71 72 82
MIM 44 45 60 60
PIE 26 29 34 34
SAC 21 20 18 21
VAL 37 43 42 42
table(data$prov,as.factor(data$age)) %>% kable(caption = "Provenance against age (in months).")
Provenance against age (in months).
11 15 20 27
LEI 92 84 65 62
MIM 72 59 40 38
PIE 36 33 27 27
SAC 23 23 17 17
VAL 48 44 37 35
# Overall distribution of sapling height.
ggplot(data, aes(x = height)) +
  theme_bw() +
  geom_histogram(colour = "darkblue", fill = "lightblue")
Height versus age.

Height versus age.

# Height distribution split by age class, one facet per age.
ggplot(data, aes(x = height, color = as.factor(age))) +
  facet_wrap(~as.factor(age)) +
  geom_histogram(fill = "white", alpha = 0.5, position = "identity") +
  theme_bw() +
  theme(legend.position = "none")  # the facet strip already labels the age
Height versus age.

Height versus age.

# Height versus age: linear fit (left) against quadratic fit (right).
# The duplicated ggplot code is factored into one helper; the smoothing
# formula is the only thing that differs between the two panels.
# The default `y ~ x` matches stat_smooth's own default for method = "lm".
height_age_plot <- function(fm = y ~ x) {
  ggplot(data, aes(x = age, y = height)) +
    geom_point(alpha = 0.2) +
    stat_smooth(method = "lm", col = "red", formula = fm) +
    theme_bw() +
    theme(axis.title = element_text(size = 16))
}
plot_grid(height_age_plot(),                # linear fit
          height_age_plot(y ~ poly(x, 2)))  # quadratic fit
Height versus age.

Height versus age.

# log(height) versus age: linear fit (left) against quadratic fit (right).
# Same refactoring as above — one helper, the smoothing formula varies.
log_height_age_plot <- function(fm = y ~ x) {
  ggplot(data, aes(x = age, y = log(height))) +
    geom_point(alpha = 0.2) +
    stat_smooth(method = "lm", col = "red", formula = fm) +
    theme_bw() +
    theme(axis.title = element_text(size = 16))
}
plot_grid(log_height_age_plot(),                # linear fit
          log_height_age_plot(y ~ poly(x, 2)))  # quadratic fit
Height versus age.

Height versus age.

# Height distribution per provenance x block cell of the design.
ggplot(data, aes(x = height, color = block)) +
  facet_grid(prov ~ block, labeller = label_both) +
  geom_histogram(fill = "white", alpha = 0.5, position = "identity") +
  theme_bw() +
  theme(legend.position = "none")  # facet labels already identify the block
Height distribution by Provenance and Block.

Height distribution by Provenance and Block.

2 “Fixed effects” models

Dummy variables for each level = Regularized intercepts, because we use weakly informative priors. But no information shared between intercepts. (See P299 in Statistical Rethinking of McElreath)

First, we are going to try three different likelihoods: the normal distribution, the normal distribution with a log-transformed response variable and a lognormal distribution.

2.1 Different distributions

2.1.1 Normal distribution mN

Mathematical model

\[\begin{equation} \begin{aligned} h_{i} & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]

Data in a list.

# Bundle the data for Stan (model mN: normal likelihood on raw height).
data.list <- list(N = length(data$height),              # Number of observations
                  y = data$height,                      # Response variable
                  age = data$age.sc,                    # Tree age (scaled)
                  nprov = length(unique(data$prov)),    # Number of provenances
                  nblock = length(unique(data$block)),  # Number of blocks
                  prov = as.numeric(data$prov),         # Provenance index
                  bloc = as.numeric(data$block))        # Block index
# `<-` (not `=`) for top-level assignment, per the tidyverse style guide.
mN <- stan_model("mN.stan")
fit.mN <- sampling(mN, data = data.list, iter = 2000, chains = 2, cores = 2)
# Posterior summary: medians, 95% intervals, R-hat and effective sample size.
# TRUE/FALSE spelled out (T/F are reassignable and unsafe).
# NOTE(review): tidyMCMC() has since moved to the broom.mixed package — confirm
# which package version this document is built against.
broom::tidyMCMC(fit.mN,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mN with a normal distribution")
Model mN with a normal distribution
term estimate std.error conf.low conf.high rhat ess
beta_age 137.13915 7.923203 121.877254 153.31531 0.9991405 2163
beta_age2 151.45624 6.176576 139.145932 163.13511 1.0003495 2356
alpha_prov[1] 45.85882 7.330742 31.699813 60.15782 1.0001671 2286
alpha_prov[2] 29.91831 7.273491 16.116101 44.65727 0.9996011 2554
alpha_prov[3] 11.93355 8.055918 -3.762625 28.12042 0.9997846 2565
alpha_prov[4] 20.06939 8.698003 3.141126 37.19453 1.0009620 2978
alpha_prov[5] 25.40380 8.021383 10.039115 41.70351 0.9991908 2433
alpha_block[1] 21.62438 7.732563 6.777951 36.80888 1.0007131 2861
alpha_block[2] 26.49717 7.669491 11.904573 41.58097 0.9996315 2385
alpha_block[3] 35.18395 7.689631 20.033232 50.79736 0.9998053 2609
alpha_block[4] 49.71694 7.681219 34.859813 64.14071 0.9993917 2420
sigma_y 141.45494 3.679044 134.601916 148.91160 0.9990337 3625
lp__ -5048.54238 2.411856 -5054.131529 -5044.93354 0.9999696 1008

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Posterior predictive check: overlay the density of 50 replicated datasets
# (rows of y_rep) on the observed height density.
ppc_dens_overlay(y = data$height,
                 yrep = as.matrix(fit.mN, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

2.1.2 Normal distribution with log(y) mNlogy

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]
# Same data list as for mN, except the response is log-transformed.
data.list.mNlogy <- list(N = length(data$height),              # Number of observations
                         y = log(data$height),                 # log(response variable)
                         age = data$age.sc,                    # Tree age (scaled)
                         nprov = length(unique(data$prov)),    # Number of provenances
                         nblock = length(unique(data$block)),  # Number of blocks
                         prov = as.numeric(data$prov),         # Provenance index
                         bloc = as.numeric(data$block))        # Block index

Same stan code as the previous model!

# `<-` for assignment; TRUE/FALSE spelled out.
mNlogy <- stan_model("mNlogy.stan")
# max_treedepth raised to 14 because this model saturates the default limit.
fit.mNlogy <- sampling(mNlogy, data = data.list.mNlogy, iter = 2000, chains = 2,
                       control = list(max_treedepth = 14), cores = 2,
                       save_warmup = FALSE)
broom::tidyMCMC(fit.mNlogy,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogy with a normal distribution and a log-transformed response variable")
Model mNlogy with a normal distribution and a log-transformed response variable
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0579301 0.3350220 2.4019656 3.7218651 1.001877 561
beta_age2 -0.8536282 0.1599965 -1.1683883 -0.5375173 1.001921 561
alpha_prov[1] 1.6513564 3.4485964 -5.8734695 7.8363884 1.017059 226
alpha_prov[2] 1.6023959 3.4494309 -5.9057697 7.7596777 1.016993 226
alpha_prov[3] 1.4990742 3.4479495 -6.0281759 7.6769490 1.017075 225
alpha_prov[4] 1.6128414 3.4481695 -5.9498839 7.7709924 1.017021 226
alpha_prov[5] 1.6099099 3.4488907 -5.9272419 7.7592365 1.017103 226
alpha_block[1] 2.0575301 3.4408496 -4.1398833 9.5821414 1.017544 225
alpha_block[2] 2.0840563 3.4407266 -4.1017959 9.6248734 1.017555 225
alpha_block[3] 2.1086391 3.4405669 -4.0651311 9.6285910 1.017572 225
alpha_block[4] 2.2099892 3.4412635 -3.9687104 9.7504930 1.017570 225
sigma_y 0.3958537 0.0099854 0.3777618 0.4162877 1.005812 531
lp__ 372.5545003 2.4294536 366.5960456 375.9544738 1.000135 514
  • There may be a warning about effective sample size. This warning should disappear if we increase the number of iterations to 2,500.

  • We have to increase the maximum treedepth: See the Brief Guide to Stan’s Warnings

  • lp has largely increased.

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Extract the replicated draws once and reuse them: the original extracted
# y_rep and then recomputed the same matrix inside ppc_dens_overlay().
y_rep <- as.matrix(fit.mNlogy, pars = "y_rep")
ppc_dens_overlay(y = log(data$height),
                 yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

A better fit than mN.

2.1.3 LogNormal distribution mLogNR

Mathematical model

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]

We are going to use the same data list as in the first model mN.

# `<-` for assignment; TRUE/FALSE spelled out.
mLogNR <- stan_model("mLogNR.stan")
# NOTE(review): unlike the other fits, no `cores` argument is given here, so
# the two chains run sequentially — add `cores = 2` if parallelism is wanted.
fit.mLogNR <- sampling(mLogNR, data = data.list, iter = 2000,
                       chains = 2, control = list(max_treedepth = 14),
                       save_warmup = FALSE)
broom::tidyMCMC(fit.mLogNR,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mLogNR with a LogNormal distribution on R")
Model mLogNR with a LogNormal distribution on R
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0555719 0.3275468 2.3713162 3.6605593 0.9992367 634
beta_age2 -0.8543036 0.1573536 -1.1465293 -0.5354714 0.9992856 628
alpha_prov[1] 2.1217546 3.4084211 -4.3739902 8.6881716 1.0131870 265
alpha_prov[2] 2.0607284 3.4087468 -4.4315566 8.6440914 1.0131141 265
alpha_prov[3] 1.9586754 3.4103368 -4.5475392 8.5584074 1.0130917 265
alpha_prov[4] 2.0703715 3.4066068 -4.4759199 8.6313427 1.0131313 265
alpha_prov[5] 2.0709604 3.4086466 -4.4550309 8.6671989 1.0131089 265
alpha_block[1] 1.6380233 3.3995181 -4.9401125 8.1192378 1.0134022 265
alpha_block[2] 1.6845306 3.4004451 -4.9253197 8.1341568 1.0133324 265
alpha_block[3] 1.6904876 3.3992693 -4.9123654 8.1533350 1.0133388 265
alpha_block[4] 1.7968742 3.4003061 -4.8303802 8.2700915 1.0133413 265
sigma_y 0.3967370 0.0094583 0.3804775 0.4175419 1.0028582 508
lp__ 372.6176314 2.4261156 366.8486216 376.1194057 1.0031655 542

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Posterior predictive check on the natural scale (lognormal model predicts
# raw heights directly, so no log-transform of y here).
ppc_dens_overlay(y = data$height,
                 yrep = as.matrix(fit.mLogNR, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

2.2 More informative priors

In the previous models, our priors are very weakly informative. We can use slightly more informative priors, which can help convergence and decrease the running time. Below, we refit the model mNlogy (normal distribution with log(y)) with more informative priors.

Variance priors

Prior recommendations for scale parameters in hierarchical models

  • Very weakly informative prior: \(\sigma \sim \text{HalfCauchy}(0,25)\). From Gelman (2006): 8-schools example (p430). And here.

  • Weakly informative prior:

    • \(\sigma \sim \text{HalfCauchy}(0,1)\) (McElreath, First version) \(\sigma \sim \text{HalfCauchy}(0,5)\) (Betancourt in 8-schools example)

    • \(\sigma \sim \text{exponential}(1)\) (McElreath, Second version) or \(\sigma \sim \text{exponential}(0.1)\)

    • \(\sigma \sim \text{LogNormal}(0,1)\) (McElreath, Second version)

  • More informative prior : \(\sigma \sim \text{HalfNormal}(0,1)\) or \(\sigma \sim \text{Half-t}(3,0,1)\)

Beta priors

  • More informative priors: \(\beta \sim \mathcal{N}(0,1)\).

  • We can also consider that \(age\) is positively associated with height so we can add some information in our model and constrain \(\beta_{age}\) to positive values, such as: \(\beta_{age} \sim \text{LogNormal}(0,1)\) or \(\beta_{age} \sim \text{Gamma}(?,?)\).

2.2.1 Exponential sigma mNlogySigmaPrior

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Same model as mNlogy but with sigma ~ Exponential(1).
# `<-` for assignment; TRUE/FALSE spelled out.
mNlogySigmaPrior <- stan_model("mNlogySigmaPrior.stan")
fit.mNlogySigmaPrior <- sampling(mNlogySigmaPrior, data = data.list.mNlogy,
                                 iter = 2000, chains = 2, cores = 2,
                                 control = list(max_treedepth = 14),
                                 save_warmup = FALSE)
broom::tidyMCMC(fit.mNlogySigmaPrior,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogySigmaPrior with a normal distribution and a log-transformed response variable")
Model mNlogySigmaPrior with a normal distribution and a log-transformed response variable
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0107288 0.3388130 2.3715193 3.6906658 1.0002014 679
beta_age2 -0.8314108 0.1619515 -1.1562546 -0.5276815 1.0002880 678
alpha_prov[1] 1.1708538 3.1865215 -5.0398797 7.5622102 1.0143187 222
alpha_prov[2] 1.1293683 3.1867863 -5.1017683 7.4990827 1.0142846 222
alpha_prov[3] 1.0029820 3.1870150 -5.1622819 7.3811998 1.0143557 222
alpha_prov[4] 1.1032702 3.1862411 -5.0818598 7.4614433 1.0144236 222
alpha_prov[5] 1.1352337 3.1864730 -5.0916072 7.5086448 1.0142397 222
alpha_block[1] 2.5951974 3.1985722 -3.9152278 8.8790141 1.0143058 222
alpha_block[2] 2.6249875 3.1987471 -3.8725381 8.9153136 1.0143092 222
alpha_block[3] 2.6452764 3.1990859 -3.8989140 8.9462457 1.0142977 222
alpha_block[4] 2.7457830 3.1988621 -3.7875396 9.0413565 1.0142977 222
sigma_y 0.3956953 0.0096858 0.3783233 0.4158609 1.0039343 690
lp__ 372.2541210 2.4180218 366.3082466 375.5637982 0.9993408 678

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Posterior predictive check on the log scale for mNlogySigmaPrior.
ppc_dens_overlay(y = log(data$height),
                 yrep = as.matrix(fit.mNlogySigmaPrior, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

2.2.2 More informative intercepts and slopes mNlogyBetaAlphaPrior

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,1)\\ \alpha_{PROV} & \sim \mathcal{N}(0,1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Same model again, now with N(0,1) priors on the intercepts and slopes too.
# `<-` for assignment; TRUE/FALSE spelled out.
mNlogyBetaAlphaPrior <- stan_model("mNlogyBetaAlphaPrior.stan")
fit.mNlogyBetaAlphaPrior <- sampling(mNlogyBetaAlphaPrior, data = data.list.mNlogy,
                                     iter = 2000, chains = 2, cores = 2,
                                     control = list(max_treedepth = 14),
                                     save_warmup = FALSE)
broom::tidyMCMC(fit.mNlogyBetaAlphaPrior,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyBetaAlphaPrior with a normal distribution and a log-transformed response variable")
Model mNlogyBetaAlphaPrior with a normal distribution and a log-transformed response variable
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0868389 0.3016956 2.4850452 3.6804960 1.000580 727
beta_age2 -0.8649354 0.1445793 -1.1439516 -0.5865007 1.000900 722
alpha_prov[1] 1.6835563 0.3265511 1.0970084 2.3686384 1.003928 412
alpha_prov[2] 1.6324825 0.3275679 1.0234395 2.3139650 1.003899 417
alpha_prov[3] 1.5305514 0.3281026 0.9259619 2.2072922 1.004010 421
alpha_prov[4] 1.6409676 0.3279010 1.0279154 2.3152684 1.003384 418
alpha_prov[5] 1.6390326 0.3272003 1.0523706 2.3203585 1.004124 417
alpha_block[1] 2.0271343 0.3357225 1.3103509 2.6627104 1.004638 428
alpha_block[2] 2.0561796 0.3351832 1.3451784 2.6817137 1.004626 433
alpha_block[3] 2.0701832 0.3355333 1.3652459 2.6955298 1.004339 433
alpha_block[4] 2.1746311 0.3363378 1.4604000 2.8231571 1.004512 432
sigma_y 0.3963958 0.0091220 0.3797049 0.4147040 1.000697 1081
lp__ 352.2499899 2.4029203 346.4175783 355.6635881 1.001058 584

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Posterior predictive check on the log scale for mNlogyBetaAlphaPrior.
ppc_dens_overlay(y = log(data$height),
                 yrep = as.matrix(fit.mNlogyBetaAlphaPrior, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

Comment: The intercept parameters change between the two models (mNlogyBetaAlphaPrior and mNlogySigmaPrior).

2.3 Vectorized models

We use again the model mNlogy (normal distribution with log(y) and very weakly informative priors)

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]

2.3.1 Option 1

# `<-` for assignment; TRUE/FALSE spelled out.
mNlogyVectorize1 <- stan_model("mNlogyVectorize1.stan")
fit.mNlogyVectorize1 <- sampling(mNlogyVectorize1, data = data.list.mNlogy,
                                 iter = 2000,
                                 control = list(max_treedepth = 14),
                                 chains = 2, cores = 2, save_warmup = FALSE)
# NOTE(review): unlike earlier tables, `pars` omits beta_age2 — presumably the
# vectorized model stores both slopes in a single vector `beta_age`; confirm
# against mNlogyVectorize1.stan.
broom::tidyMCMC(fit.mNlogyVectorize1,
                pars = c("beta_age", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyVectorize1 with a normal distribution and log(y)")
Model mNlogyVectorize1 with a normal distribution and log(y)
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0295758 0.3437384 2.3854121 3.7243783 1.0025844 517
alpha_prov[1] 1.7320294 3.4026410 -4.7045217 8.3893157 1.0063156 320
alpha_prov[2] 1.6830447 3.4025796 -4.7571554 8.3275703 1.0063898 320
alpha_prov[3] 1.5823837 3.4036095 -4.9119571 8.2331163 1.0064110 320
alpha_prov[4] 1.6850773 3.4019339 -4.7694287 8.3539318 1.0063527 320
alpha_prov[5] 1.6921565 3.4023780 -4.7398002 8.3212315 1.0063940 320
alpha_block[1] 1.9698698 3.4082661 -4.5266003 8.5171252 1.0065503 319
alpha_block[2] 2.0003218 3.4086478 -4.5091504 8.4954238 1.0065544 319
alpha_block[3] 2.0176696 3.4082629 -4.4820293 8.5405708 1.0065507 319
alpha_block[4] 2.1158136 3.4083847 -4.4289952 8.6266011 1.0066272 319
sigma_y 0.3966256 0.0095206 0.3782708 0.4152456 1.0000090 896
lp__ 372.6274861 2.3729074 366.7477482 376.0156133 0.9993589 705
  • Still need to increase max_treedepth here !

2.3.2 Option 2

# `<-` for assignment; TRUE/FALSE spelled out.
mNlogyVectorize2 <- stan_model("mNlogyVectorize2.stan")
fit.mNlogyVectorize2 <- sampling(mNlogyVectorize2, data = data.list.mNlogy,
                                 iter = 2000,
                                 control = list(max_treedepth = 14),
                                 chains = 2, cores = 2, save_warmup = FALSE)
## Warning: The largest R-hat is 1.06, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
# Posterior summary for the second vectorized model. TRUE/FALSE spelled out.
broom::tidyMCMC(fit.mNlogyVectorize2,
                pars = c("beta_age", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyVectorize2 with a normal distribution and log(y)")
Model mNlogyVectorize2 with a normal distribution and log(y)
term estimate std.error conf.low conf.high rhat ess
beta_age 0.4091226 0.0138510 0.3821943 0.4360379 0.9999335 838
alpha_prov[1] 1.8450212 3.2578420 -4.5137258 8.3510905 1.0605418 78
alpha_prov[2] 1.7888478 3.2562462 -4.5845030 8.2720471 1.0606634 77
alpha_prov[3] 1.6946440 3.2581851 -4.6910023 8.2088985 1.0604755 78
alpha_prov[4] 1.8247183 3.2568439 -4.5790838 8.3067101 1.0605361 78
alpha_prov[5] 1.8145836 3.2562120 -4.5532209 8.2905805 1.0604501 78
alpha_block[1] 3.1120838 3.2552218 -3.3770905 9.4613277 1.0604816 78
alpha_block[2] 3.1419801 3.2548740 -3.3577467 9.4848990 1.0606863 78
alpha_block[3] 3.1441316 3.2547984 -3.3183893 9.5163807 1.0607813 77
alpha_block[4] 3.2769549 3.2562493 -3.2338038 9.5880708 1.0605484 78
sigma_y 0.4094675 0.0095676 0.3926185 0.4299091 1.0026968 1152
lp__ -505.5094293 2.4034737 -510.7965897 -501.7509199 1.0001844 614

Why does this model have a negative lp__? It is also longer to run and has a low effective sample size. Why?

Let’s compare the speed of the 5 models with a normal distribution with log(y)

# Compare the elapsed time (per chain: warmup + sampling, in seconds) of the
# five log(y) models, sorted from fastest to slowest.
list(fit.mNlogy, fit.mNlogySigmaPrior, fit.mNlogyBetaAlphaPrior,
     fit.mNlogyVectorize1, fit.mNlogyVectorize2) %>%
  lapply(get_elapsed_time) %>%
  lapply(data.frame) %>%
  bind_rows(.id = "model") %>%
  mutate(model = recode_factor(model,
                               "1" = "Model mNlogy with $\\sigma \\sim \\text{HalfCauchy}(0,25)$",
                               "2" = "Model mNlogySigmaPrior with $\\sigma \\sim \\text{Exponential}(1)$",
                               "3" = "Model mNlogyBetaAlphaPrior with $\\beta$ and $\\alpha  \\sim \\mathcal{N}(0,1)$ ",
                               "4" = "Model mNlogyVectorize1",
                               "5" = "Model mNlogyVectorize2")) %>%
  mutate(total = warmup + sample) %>%
  arrange(total) %>%
  kable(caption = "Model speed comparison of models with a normal distribution and a log-transformed response variable.")
Model speed comparison of models with a normal distribution and a log-transformed response variable.
model warmup sample total
Model mNlogyBetaAlphaPrior with \(\beta\) and \(\alpha \sim \mathcal{N}(0,1)\) 17.4769 19.8944 37.3713
Model mNlogyBetaAlphaPrior with \(\beta\) and \(\alpha \sim \mathcal{N}(0,1)\) 16.9092 23.2911 40.2003
Model mNlogyVectorize2 27.6535 31.0206 58.6741
Model mNlogyVectorize1 26.9988 31.8640 58.8628
Model mNlogyVectorize1 28.8041 30.4957 59.2998
Model mNlogyVectorize2 29.4740 31.2363 60.7103
Model mNlogy with \(\sigma \sim \text{HalfCauchy}(0,25)\) 67.4086 76.5003 143.9089
Model mNlogy with \(\sigma \sim \text{HalfCauchy}(0,25)\) 74.6366 71.0961 145.7327
Model mNlogySigmaPrior with \(\sigma \sim \text{Exponential}(1)\) 69.9210 75.9105 145.8315
Model mNlogySigmaPrior with \(\sigma \sim \text{Exponential}(1)\) 70.7925 82.7382 153.5307

For the subsequent models, we will use the more informative priors of mNlogyBetaAlphaPrior.

3 Multilevel models with one-varying intercept

Adaptive regularization

Links:

We are going to

3.1 Normal distribution with log(y)

3.1.1 Centered parameterization mmNlogyC

P357 McElreath (first version).

mmNlogyC (multi-level model with normal distribution and log(y) - centered parameterization)

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(log(\mu_{i}),\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{PROV[p]}\\ \alpha_{PROV} & \sim \mathcal{N}(\mu_{\alpha_{PROV}},\sigma_{\alpha_{PROV}})\\ \mu_{\alpha_{PROV}} & \sim \mathcal{N}(0,1)\\ \sigma_{\alpha_{PROV}}& \sim \text{Exponential}(1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Reduced data list: block effects are dropped, only provenance is kept.
data.list.reduced <- list(N = length(data$height),            # Number of observations
                          y = log(data$height),               # log(response variable)
                          age = data$age.sc,                  # Tree age (scaled)
                          nprov = length(unique(data$prov)),  # Number of provenances
                          prov = as.numeric(data$prov))       # Provenance index
mmNlogyC <- stan_model("mmNlogyC.stan")
# TRUE/FALSE spelled out (T/F are reassignable and unsafe).
fit.mmNlogyC <- sampling(mmNlogyC, data = data.list.reduced, iter = 2000,
                         chains = 2, cores = 2,
                         control = list(max_treedepth = 14),
                         save_warmup = FALSE)
## Warning: There were 2 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
# Posterior summary of the hyperparameters of the centered model.
# TRUE/FALSE spelled out.
broom::tidyMCMC(fit.mmNlogyC,
                pars = c("beta_age", "beta_age2", "mean_alpha_prov",
                         "sigma_alpha_prov", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmNlogyC with centered parameterization")
One-varying intercept model mmNlogyC with centered parameterization
term estimate std.error conf.low conf.high rhat ess
beta_age 2.9019767 0.3052398 2.3184176 3.4895899 1.008593 223
beta_age2 -0.7748249 0.1468177 -1.0640999 -0.4943766 1.008222 230
mean_alpha_prov 3.7900529 0.1500195 3.4973060 4.0789278 1.009119 227
sigma_alpha_prov 0.0606109 0.0584825 0.0172239 0.2093643 1.001321 491
sigma_y 0.3998205 0.0097379 0.3813280 0.4185636 1.002163 820
lp__ 362.6685007 2.5988537 356.5332759 366.4661911 1.002579 513

This model has some divergent transitions, low number of effective sample size and R-hat not exactly equal to one.

Divergent transitions

  • To avoid divergent transitions, we can increase adapt_delta..

McElreath (Second version) : “[…] the target acceptance rate is controlled by the adapt_delta control parameter. The default is 0.95, which means that it aims to attain a 95% acceptance rate. It tries this during the warmup phase, adjusting the step size of each leapfrog step (go back to Chapter 9 if these terms aren’t familiar). When adapt_delta is set high, it results in a smaller step size, which means a more accurate approximation of the curved surface. It also means more computation, which means a slower chain. Increasing adapt_delta can often, but not always, help with divergent transitions.”

  • You can also try to add some information in your model, for instance with \(\beta_{age} \sim \text{LogNormal}(0,1)\).

  • Or you can reparameterize your model with the non-centered parameterization. (Best option!!)

# Posterior predictive check on the log scale for the centered model.
ppc_dens_overlay(y = log(data$height),
                 yrep = as.matrix(fit.mmNlogyC, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

# Trace plots for a varying intercept and its scale parameter; NUTS
# diagnostics (np) mark any divergent iterations on the traces.
mcmc_trace(as.array(fit.mmNlogyC),
           pars = c("alpha_prov[3]", "sigma_alpha_prov"),
           np = nuts_params(fit.mmNlogyC)) +
  xlab("Post-warmup iteration")

# Pairs plot of the main parameters; divergent transitions are drawn as
# larger filled points to reveal where the sampler struggles.
mcmc_pairs(as.array(fit.mmNlogyC),
           np = nuts_params(fit.mmNlogyC),
           pars = c("beta_age", "beta_age2", "mean_alpha_prov",
                    "sigma_alpha_prov", "alpha_prov[3]", "sigma_y"),
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size = 2, div_shape = 19))

3.1.2 Non-centered parameterization mmNlogyNC

Links:


From McElreath, P429 (13.4.2.) of Statistical Rethinking (second version)

\[ \alpha \sim \mathcal{N}(\mu,\sigma)\]

is equivalent to

\[\begin{equation} \begin{aligned} \alpha &= \mu + \beta\\ \beta &\sim \mathcal{N}(0,\sigma) \end{aligned} \end{equation}\]

is equivalent to

\[\begin{equation} \begin{aligned} \alpha &= \mu + z\sigma\\ z &\sim \mathcal{N}(0,1) \end{aligned} \end{equation}\]

No parameters are left inside the prior.


From Updating: A Set of Bayesian Notes. Jeffrey B. Arnold. 20 Multilevel Models

These are two ways of writing the same model. However, they change the parameters that the HMC algorithm is actively sampling and thus can have different sampling performance.

However, neither is universally better.

  • If much data, the non-centered parameterization works better
  • If less data, the centered parameterization works better

And there is currently no ex-ante way to know which will work better, and at what amount of “data” that the performance of one or the other is better. However, one other reason to use the centered parameterization (if it is also scaled), is that the Stan HMC implementation tends to be more efficient if all parameters are on the scale.


mmNlogyNC (multi-level model with normal distribution and log(y) - non-centered parameterization)

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(log(\mu_{i}),\sigma_{i})\\ \mu_{i} & = \alpha + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + z_{PROV[p]}\sigma_{PROV}\\ \alpha & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{\alpha_{PROV}}& \sim \text{Exponential}(1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Non-centered version of the one-varying-intercept model.
# TRUE/FALSE spelled out (T/F are reassignable and unsafe).
mmNlogyNC <- stan_model("mmNlogyNC.stan")
fit.mmNlogyNC <- sampling(mmNlogyNC, data = data.list.reduced, iter = 2000,
                          chains = 2, cores = 2,
                          control = list(max_treedepth = 14),
                          save_warmup = FALSE)
broom::tidyMCMC(fit.mmNlogyNC,
                pars = c("beta_age", "beta_age2", "z_prov", "sigma_prov",
                         "alpha", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmNlogyNC with non-centered parameterization")
One-varying intercept model mmNlogyNC with non-centered parameterization
term estimate std.error conf.low conf.high rhat ess
beta_age 2.8963848 0.2936762 2.3219892 3.4556964 0.9997297 707
beta_age2 -0.7719118 0.1417713 -1.0441907 -0.4943003 0.9996088 721
z_prov[1] 0.8382786 0.6527829 -0.3638923 2.2019258 1.0002800 959
z_prov[2] 0.1255077 0.6335012 -1.1256549 1.3308042 0.9992908 1033
z_prov[3] -1.0181842 0.7390431 -2.5026089 0.4347973 1.0011316 771
z_prov[4] 0.1321020 0.6979406 -1.3108607 1.4895910 0.9992205 1224
z_prov[5] 0.2273153 0.6506613 -1.0936794 1.5297297 0.9995888 1093
sigma_prov 0.0604593 0.0679792 0.0147948 0.2267167 1.0027545 434
alpha 3.7902642 0.1495662 3.4877374 4.0659309 0.9995612 659
sigma_y 0.4000639 0.0099042 0.3811835 0.4203267 0.9991321 1643
lp__ 348.5608194 2.9013131 341.8144068 352.9438544 1.0041025 412

No more divergent transitions! But still low effective sample size.

# Posterior predictive check on the log scale for the non-centered model.
ppc_dens_overlay(y = log(data$height),
                 yrep = as.matrix(fit.mmNlogyNC, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

# Trace plots for the standardized intercept and its scale, with NUTS
# diagnostics overlaid (no divergences expected after reparameterization).
mcmc_trace(as.array(fit.mmNlogyNC),
           pars = c("z_prov[3]", "sigma_prov"),
           np = nuts_params(fit.mmNlogyNC)) +
  xlab("Post-warmup iteration")
## No divergences to plot.

# Pairs plot to look for funnel shapes and the location of divergences.
pairs_pars <- c("beta_age", "beta_age2", "alpha", "sigma_prov", "z_prov[3]", "sigma_y")
mcmc_pairs(as.array(fit.mmNlogyNC),
           np = nuts_params(fit.mmNlogyNC),
           pars = pairs_pars,
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size = 3, div_shape = 19))

3.2 Lognormal distribution

3.2.1 Non-centered parameterization mmLogNnc


For a lognormal distribution the non-centered parameterization is different:

\[\alpha \sim \mathcal{logN}(log(\mu),\sigma)\]

is equivalent to

\[\begin{equation} \begin{aligned} \alpha &= e^{log(\mu) + z.\sigma} \\ z &\sim \mathcal{N}(0,1) \end{aligned} \end{equation}\]

mmLogNnc (multi-level LogNormal distribution - non-centered parameterization)

Mathematical model

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(log(\mu_{i}),\sigma)\\ \mu_{i} & = e^{log(\alpha) + z_{PROV[p]}\sigma_{PROV}} + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} \\ \alpha & \sim \text{LogNormal}(0,1) \\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{PROV} & \sim \text{Exponential}(1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Data for the lognormal model: the response is raw height (the likelihood
# handles the log), unlike the Gaussian-on-log-height model above.
data.list.reduced.logN <- list(N = length(data$height),           # Number of observations
                               y = data$height,                   # Response variable (raw height)
                               age = data$age.sc,                 # Tree age (scaled)
                               nprov = length(unique(data$prov)), # Number of provenances
                               prov = as.numeric(data$prov))      # Provenance index
mmLogNnc <- stan_model("mmLogNnc.stan")
fit.mmLogNnc <- sampling(mmLogNnc, data = data.list.reduced.logN, iter = 2000, chains = 2,
                         cores = 2, control = list(max_treedepth = 14),
                         save_warmup = FALSE)  # TRUE/FALSE, not T/F (T and F are reassignable)
## Warning: There were 16 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
# Posterior medians, 95% intervals, Rhat and effective sample sizes.
broom::tidyMCMC(fit.mmLogNnc,
                pars = c("beta_age", "beta_age2", "z_prov", "sigma_prov", "alpha",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%  # TRUE, not T
  kable(caption = "One-varying intercept model mmLogNnc with Lognormal non-centered parameterization")
One-varying intercept model mmLogNnc with Lognormal non-centered parameterization
term estimate std.error conf.low conf.high rhat ess
beta_age 1.0893722 1.0166708 -0.8637837 3.1690709 0.9991699 1714
beta_age2 2.2029369 0.9547520 0.3538261 4.1541712 1.0008084 1847
z_prov[1] 0.6858172 0.7519873 -0.8962745 2.1977663 1.0005479 486
z_prov[2] -0.2466287 0.7600026 -1.7962695 1.3257914 1.0020855 318
z_prov[3] -0.4903074 0.8381313 -2.1682095 1.1721589 1.0072173 527
z_prov[4] 0.2150199 0.7882349 -1.4275055 1.7074164 1.0001439 870
z_prov[5] 0.3250498 0.7586772 -1.2573239 1.6897075 1.0028006 506
sigma_prov 0.0505327 0.0533628 0.0030357 0.2043873 1.0014523 433
alpha 314.2389761 13.7904298 277.6893436 335.2116801 1.0107708 114
sigma_y 0.5777587 0.0136868 0.5525291 0.6051340 0.9992581 1949
lp__ 15.9954343 2.7988806 9.9019425 20.6543876 1.0020345 502
# Posterior predictive check on the natural scale: 50 replicated datasets
# against the observed heights.
rep_draws <- as.matrix(fit.mmLogNnc, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = data$height, yrep = rep_draws) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

# Trace plots for one z-score and the group-level standard deviation.
mcmc_trace(as.array(fit.mmLogNnc),
           pars = c("z_prov[3]", "sigma_prov"),
           np = nuts_params(fit.mmLogNnc)) +
  xlab("Post-warmup iteration")

# Pairs plot with divergent transitions highlighted.
pairs_pars <- c("beta_age", "beta_age2", "alpha", "sigma_prov", "z_prov[3]", "sigma_y")
mcmc_pairs(as.array(fit.mmLogNnc),
           np = nuts_params(fit.mmLogNnc),
           pars = pairs_pars,
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size = 3, div_shape = 19))

4 Two varying intercepts (prov and block)

4.1 Centered parameterization (what we shouldn’t do)

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \alpha_{BLOCK} & \sim \mathcal{N}(\mu_{\alpha_{BLOCK}},\sigma_{\alpha_{BLOCK}})\\ \alpha_{PROV} & \sim \mathcal{N}(\mu_{\alpha_{PROV}},\sigma_{\alpha_{PROV}})\\ \mu_{\alpha_{PROV}} & \sim \mathcal{N}(0,1)\\ \mu_{\alpha_{BLOCK}}& \sim \mathcal{N}(0,1)\\ \sigma_{\alpha_{PROV}} & \sim \text{HalfCauchy}(0,1)\\ \sigma_{\alpha_{BLOCK}} & \sim \text{HalfCauchy}(0,1)\\ \sigma & \sim \text{HalfCauchy}(0,1) \end{aligned} \end{equation}\]
# Data for the two-varying-intercept models (provenance and block).
data.list_mod2_2 <- list(N = length(data$height),            # Number of observations
                         y = data$height,                    # Response variable (raw height)
                         age = data$age.sc,                  # Tree age (scaled)
                         nprov = length(unique(data$prov)),  # Number of provenances
                         nblock = length(unique(data$block)),# Number of blocks
                         prov = as.numeric(data$prov),       # Provenance index
                         bloc = as.numeric(data$block))      # Block index (name "bloc" presumably matches the Stan data block -- verify)
mod2_2 <- stan_model("mod2_2.stan")  # use `<-`, not `=`, for assignment
fit.mod2_2 <- sampling(mod2_2, data = data.list_mod2_2, iter = 2000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14))
print(fit.mod2_2, pars = c("beta_age", "beta_age2",
                           "alpha_prov", "alpha_block",
                           "mean_alpha_prov", "sigma_alpha_prov",
                           "mean_alpha_block", "sigma_alpha_block",
                           "sigma_y"), probs = c(0.10, 0.5, 0.9))
# Posterior predictive check: observed heights vs 50 replicated datasets.
y_rep <- as.matrix(fit.mod2_2, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

McElreath: “[…] note that there is only one global mean parameter \(\alpha\), and both of the varying intercept parameters are centered at zero. We can’t identify a separate mean for each varying intercept type, because both intercepts are added to the same linear prediction. So it is conventional to define varying intercepts with a mean of zero, so there’s no risk of accidentally creating hard-to-identify parameters.” “If you do include a mean for each cluster type, it won’t be the end of the world, however.”

4.1.1 More informative priors

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \beta_{age} & \sim \text{LogNormal}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \alpha_{BLOCK} & \sim \mathcal{N}(\mu_{\alpha_{BLOCK}},\sigma_{\alpha_{BLOCK}})\\ \alpha_{PROV} & \sim \mathcal{N}(\mu_{\alpha_{PROV}},\sigma_{\alpha_{PROV}})\\ \mu_{\alpha_{PROV}} & \sim \mathcal{N}(0,1)\\ \mu_{\alpha_{BLOCK}}& \sim \mathcal{N}(0,1)\\ \sigma_{\alpha_{PROV}} & \sim \text{Exponential}(1)\\ \sigma_{\alpha_{BLOCK}} & \sim \text{Exponential}(1)\\ \sigma & \sim \text{HalfCauchy}(0,1) \end{aligned} \end{equation}\]
# Same model as mod2_2 but with more informative priors (LogNormal on
# beta_age, Exponential on the group-level standard deviations).
mod2_2_otherpriors <- stan_model("mod2_2_otherpriors.stan")  # use `<-`, not `=`
fit.mod2_2_otherpriors <- sampling(mod2_2_otherpriors, data = data.list_mod2_2,
                                   iter = 2000, chains = 2, cores = 2,
                                   control = list(max_treedepth = 14))
print(fit.mod2_2_otherpriors, pars = c("beta_age", "beta_age2",
                                       "alpha_prov", "alpha_block",
                                       "mean_alpha_prov", "sigma_alpha_prov",
                                       "mean_alpha_block", "sigma_alpha_block",
                                       "sigma_y"), probs = c(0.10, 0.5, 0.9))

4.2 Centered parameterization (what we should do)

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma^{2}_{i})\\ \mu_{i} & = \alpha + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \alpha & \sim \mathcal{N}(0,1)\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,\sigma_{\alpha_{BLOCK}})\\ \alpha_{PROV} & \sim \mathcal{N}(0,\sigma_{\alpha_{PROV}})\\ \sigma_{\alpha_{PROV}} & \sim \text{HalfCauchy}(0,1)\\ \sigma_{\alpha_{BLOCK}} & \sim \text{HalfCauchy}(0,1)\\ \sigma & \sim \text{HalfCauchy}(0,1) \end{aligned} \end{equation}\]
# Identified version: one global intercept alpha, both sets of varying
# intercepts centered at zero (see the McElreath quote above).
mod2_3 <- stan_model("mod2_3.stan")  # use `<-`, not `=`, for assignment
fit.mod2_3 <- sampling(mod2_3, data = data.list_mod2_2, iter = 2000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14))
print(fit.mod2_3, pars = c("beta_age", "beta_age2",
                           "alpha",
                           "alpha_prov", "alpha_block",
                           "sigma_alpha_prov", "sigma_alpha_block",
                           "sigma_y"), probs = c(0.10, 0.5, 0.9))
# Posterior predictive check: observed heights vs 50 replicated datasets.
y_rep <- as.matrix(fit.mod2_3, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

4.2.1 More informative priors

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma^{2}_{i})\\ \mu_{i} & = \alpha + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \beta_{age} & \sim \text{LogNormal}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \alpha & \sim \text{LogNormal}(0,1)\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,\sigma_{\alpha_{BLOCK}})\\ \alpha_{PROV} & \sim \mathcal{N}(0,\sigma_{\alpha_{PROV}})\\ \sigma_{\alpha_{PROV}} & \sim \text{Exponential}(1)\\ \sigma_{\alpha_{BLOCK}} & \sim \text{Exponential}(1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Same identified model with more informative priors.
mod2_3_otherpriors <- stan_model("mod2_3_otherpriors.stan")  # use `<-`, not `=`
fit.mod2_3_otherpriors <- sampling(mod2_3_otherpriors, data = data.list_mod2_2,
                                   iter = 2000, chains = 2, cores = 2,
                                   control = list(max_treedepth = 14))
print(fit.mod2_3_otherpriors, pars = c("beta_age", "beta_age2",
                                       "alpha",
                                       "alpha_prov", "alpha_block",
                                       "sigma_alpha_prov", "sigma_alpha_block",
                                       "sigma_y"), probs = c(0.10, 0.5, 0.9))

\(\alpha\) is very different between the two models!

With more iterations:

# Re-fit the same compiled model with 3000 iterations per chain to check
# whether the alpha discrepancy persists with longer chains.
fit.mod2_3_otherpriors_iter3000 <- sampling(mod2_3_otherpriors,
                                            data = data.list_mod2_2,
                                            iter = 3000, chains = 2, cores = 2,
                                            control = list(max_treedepth = 14))
print(fit.mod2_3_otherpriors_iter3000,
      pars = c("beta_age", "beta_age2",
               "alpha",
               "alpha_prov", "alpha_block",
               "sigma_alpha_prov", "sigma_alpha_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))

4.3 Non-centered parameterization

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma^{2}_{i})\\ \mu_{i} & = \alpha + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + z_{BLOCK[b]}\sigma_{BLOCK} + z_{PROV[p]}\sigma_{PROV}\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \alpha & \sim \mathcal{N}(0,10)\\ z_{BLOCK} & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{PROV} & \sim HalfCauchy(0,1)\\ \sigma_{BLOCK} & \sim HalfCauchy(0,1)\\ \sigma & \sim HalfCauchy(0,1) \end{aligned} \end{equation}\]
# Non-centered parameterization: varying intercepts expressed as
# z-scores times their group-level standard deviations.
mod2_4 <- stan_model("mod2_4.stan")  # use `<-`, not `=`, for assignment
fit.mod2_4 <- sampling(mod2_4, data = data.list_mod2_2, iter = 2000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14))
print(fit.mod2_4, pars = c("beta_age", "beta_age2",
                           "alpha",
                           "z_prov", "z_block",
                           "sigma_prov", "sigma_block",
                           "sigma_y"), probs = c(0.10, 0.5, 0.9))
# Posterior predictive check: observed heights vs 50 replicated datasets.
y_rep <- as.matrix(fit.mod2_4, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace plots with NUTS diagnostics overlaid.
posterior_cp <- as.array(fit.mod2_4)
np_cp <- nuts_params(fit.mod2_4)
mcmc_trace(posterior_cp, pars = c("alpha", "sigma_block", "sigma_prov", "z_block[3]"),
           np = np_cp) +
  xlab("Post-warmup iteration")

4.3.1 More informative priors

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma^{2}_{i})\\ \mu_{i} & = \alpha + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + z_{BLOCK[b]}\sigma_{BLOCK} + z_{PROV[p]}\sigma_{PROV}\\ \beta_{age} & \sim \text{LogNormal}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \alpha & \sim \text{LogNormal}(0,10)\\ z_{BLOCK} & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{PROV} & \sim \text{Exponential}(1)\\ \sigma_{BLOCK} & \sim \text{Exponential}(1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Non-centered parameterization with more informative priors.
mod2_4_otherpriors <- stan_model("mod2_4_otherpriors.stan")  # use `<-`, not `=`
fit.mod2_4_otherpriors <- sampling(mod2_4_otherpriors, data = data.list_mod2_2,
                                   iter = 2000, chains = 2, cores = 2,
                                   control = list(max_treedepth = 14))
print(fit.mod2_4_otherpriors, pars = c("beta_age", "beta_age2",
                                       "alpha",
                                       "z_prov", "z_block",
                                       "sigma_prov", "sigma_block",
                                       "sigma_y"), probs = c(0.10, 0.5, 0.9))
# Posterior predictive check: observed heights vs 50 replicated datasets.
y_rep <- as.matrix(fit.mod2_4_otherpriors, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace plots with NUTS diagnostics overlaid.
posterior_cp <- as.array(fit.mod2_4_otherpriors)
np_cp <- nuts_params(fit.mod2_4_otherpriors)
mcmc_trace(posterior_cp, pars = c("alpha", "sigma_block", "sigma_prov", "z_block[3]"),
           np = np_cp) +
  xlab("Post-warmup iteration")

Let’s increase the target acceptance (adapt_delta=0.99)

McElreath (second edition): “[…] the target acceptance rate is controlled by the adapt_delta control parameter. The default is 0.95, which means that it aims to attain a 95% acceptance rate. It tries this during the warmup phase, adjusting the step size of each leapfrog step (go back to Chapter 9 if these terms aren’t familiar). When adapt_delta is set high, it results in a smaller step size, which means a more accurate approximation of the curved surface. It also means more computation, which means a slower chain. Increasing adapt_delta can often, but not always, help with divergent transitions.” (Note: the 0.95 default refers to the rethinking package’s ulam; rstan’s sampling default is 0.8, as the warning above indicates.)

# Re-fit with a higher target acceptance rate; smaller step sizes should
# remove the remaining divergent transitions (at the cost of speed).
fit.mod2_4_otherpriors_adaptdelta <- sampling(mod2_4_otherpriors,
                                              data = data.list_mod2_2,
                                              iter = 2000, chains = 2, cores = 2,
                                              control = list(max_treedepth = 14,
                                                             adapt_delta = 0.99))
print(fit.mod2_4_otherpriors_adaptdelta,
      pars = c("beta_age", "beta_age2",
               "alpha",
               "z_prov", "z_block",
               "sigma_prov", "sigma_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))

Longer to run, but it’s ok now ! And similar to model mod2_3_otherpriors (model with centered parameterization and more informative priors).

5 Varying intercepts and varying slopes

# Data for the varying-intercepts / varying-slopes models
# (same contents as data.list_mod2_2, rebuilt so this section is self-contained).
data.list_mod3 <- list(N = length(data$height),             # Number of observations
                       y = data$height,                     # Response variable (raw height)
                       age = data$age.sc,                   # Tree age (scaled)
                       nprov = length(unique(data$prov)),   # Number of provenances
                       nblock = length(unique(data$block)), # Number of blocks
                       prov = as.numeric(data$prov),        # Provenance index
                       bloc = as.numeric(data$block))       # Block index

5.1 Centered parameterization

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma_{i})\\[4pt] \mu_{i} & = \alpha + \alpha_{BLOCK[b]} + \alpha_{PROV[p]} + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \gamma_{PROV[p]}age_{i} \\[4pt] \begin{bmatrix} \alpha_{PROV[p]} \\ \gamma_{PROV[p]} \end{bmatrix} & \sim \text{MVNormal}\left(\begin{bmatrix} 0 \\ 0 \end{bmatrix},\mathbf{S} \right) \\[4pt] \mathbf{S} & = \begin{pmatrix} \sigma_{\alpha_{PROV[p]}} & 0 \\ 0 & \sigma_{\gamma_{PROV[p]}} \end{pmatrix} \begin{pmatrix} 1 & \rho \\ \rho & 1 \end{pmatrix} \begin{pmatrix} \sigma_{\alpha_{PROV[p]}} & 0 \\ 0 & \sigma_{\gamma_{PROV[p]}} \end{pmatrix} \\[4pt] \alpha & \sim \text{LogNormal}(0,1)\\[4pt] \beta_{age} & \sim \text{LogNormal}(0,1) \\[4pt] \beta_{age2} & \sim \mathcal{N}(0,1)\\[4pt] \alpha_{BLOCK} & \sim \mathcal{N}(0,\sigma_{\alpha_{BLOCK}})\\[4pt] \sigma_{\alpha_{BLOCK}} & \sim \text{Exponential}(1)\\[4pt] \sigma & \sim \text{Exponential}(1)\\[4pt] \sigma_{\gamma_{PROV[p]}} & \sim \text{Exponential}(1)\\[4pt] \sigma_{\alpha_{PROV}} & \sim \text{Exponential}(1)\\[4pt] \begin{pmatrix} 1 & \rho \\ \rho & 1 \end{pmatrix} & \sim \text{LKJcorr(2)} \end{aligned} \end{equation}\]

5.1.1 mod3_1

In this model, I followed the example from here: Stan code of Statistical Rethinking. 13.3 Example: cross-classified chimpanzees with varying slopes.

Even with a 99% acceptance rate (adapt_delta=0.99) and 3000 iterations, the model had some divergent transitions and small sample sizes.

One thing I didn’t understand with this model code is: where are LKJ and \(\sigma_{\alpha_{BLOCK}}\) priors?

# Varying intercepts and slopes, centered parameterization
# (after McElreath's cross-classified chimpanzees example).
mod3_1 <- stan_model("mod3_1.stan")  # use `<-`, not `=`, for assignment
fit.mod3_1 <- sampling(mod3_1, data = data.list_mod3, iter = 3000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14, adapt_delta = 0.99))
print(fit.mod3_1, probs = c(0.10, 0.5, 0.9))
# Posterior predictive check: observed heights vs 50 replicated datasets.
y_rep <- as.matrix(fit.mod3_1, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

Let’s call the correlation matrix \(\mathbf{R}\):

\[\mathbf{R} = \begin{pmatrix} 1 & \rho \\ \rho & 1 \end{pmatrix}\] If the prior was:

\[\mathbf{R} \sim \text{LKJcorr(2)}\]

There were more divergent transitions.

If the prior was:

\[\mathbf{R} \sim \text{LKJcorr(4)}\]

There were fewer divergent transitions; see the model below.

McElreath: “So whatever is the LKJcorr distribution? What LKJcorr(2) does is define a weakly informative prior on \(\rho\) that is skeptical of extreme correlations near −1 or 1. You can think of it as a regularizing prior for correlations. This distribution has a single parameter, \(\eta\), that controls how skeptical the prior is of large correlations in the matrix. When we use LKJcorr(1), the prior is flat over all valid correlation matrices. When the value is greater than 1, such as the 2 we used above, then extreme correlations are less likely. To visualize this family of priors, it will help to sample random matrices from it and plot the distribution of correlations”

# Sample correlation matrices from LKJ priors with different eta and plot
# the implied density of the off-diagonal correlation.
# NOTE: rethinking::dens() draws a base-graphics plot and returns NULL
# (invisibly), so cowplot::plot_grid() -- which expects ggplot objects --
# cannot arrange these panels. Use a base-graphics layout instead.
Rho2 <- rlkjcorr(1e4, K = 2, eta = 2)
Rho1 <- rlkjcorr(1e4, K = 2, eta = 1)
Rho4 <- rlkjcorr(1e4, K = 2, eta = 4)
op <- par(mfrow = c(1, 3))  # three panels side by side
dens(Rho1[, 1, 2], xlim = c(-1, 1), xlab = "correlation", ylim = c(0, 1.2), main = "eta=1")
dens(Rho2[, 1, 2], xlab = "correlation", xlim = c(-1, 1), ylim = c(0, 1.2), main = "eta=2")
dens(Rho4[, 1, 2], xlim = c(-1, 1), xlab = "correlation", ylim = c(0, 1.2), main = "eta=4")
par(op)  # restore the previous graphics layout

5.1.2 mod3_2

Model with \(\mathbf{R} \sim \text{LKJcorr(4)}\)

# Same model as mod3_1 but with a more skeptical LKJcorr(4) prior on the
# correlation matrix.
mod3_2 <- stan_model("mod3_2.stan")  # use `<-`, not `=`, for assignment
fit.mod3_2 <- sampling(mod3_2, data = data.list_mod3, iter = 3000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14, adapt_delta = 0.99))
print(fit.mod3_2, pars = c("beta_age", "beta_age2",
                           "alpha", "alpha_block", "sigma_block",
                           "Rho_prov", "sigma_prov", "alpha_prov", "beta_prov",
                           "v_prov", "SRS_prov",
                           "sigma_y"), probs = c(0.10, 0.5, 0.9))
# Posterior predictive check: observed heights vs 50 replicated datasets.
y_rep <- as.matrix(fit.mod3_2, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

5.2 Non-centered parameterization

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma_{i})\\[4pt] \mu_{i} & = \alpha + z_{\alpha_{BLOCK[b]}}\sigma_{\alpha_{BLOCK}} + z_{\alpha_{PROV[p]}}\sigma_{\alpha_{PROV}} + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + z_{\gamma_{PROV[p]}}\sigma_{\gamma_{PROV}}age_{i} \\[4pt] \begin{bmatrix} z_{\alpha_{PROV[p]}} \\ z_{\gamma_{PROV[p]}} \end{bmatrix} & \sim \text{MVNormal}\left(\begin{bmatrix} 0 \\ 0 \end{bmatrix},\mathbf{R} \right) \\[4pt] \beta_{age} & \sim \text{LogNormal}(0,1) \\[4pt] \beta_{age2} & \sim \mathcal{N}(0,1)\\[4pt] \alpha & \sim \text{LogNormal}(0,1)\\[4pt] z_{\alpha_{BLOCK[p]}} & \sim \mathcal{N}(0,1)\\[4pt] \sigma_{\alpha_{BLOCK}} & \sim \text{Exponential}(1)\\[4pt] \sigma & \sim \text{Exponential}(1)\\[4pt] \sigma_{\gamma_{PROV}} & \sim \text{Exponential}(1)\\[4pt] \sigma_{\alpha_{PROV}} & \sim \text{Exponential}(1)\\[4pt] \mathbf{R}& \sim \text{LKJcorr(4)} \end{aligned} \end{equation}\]

5.2.1 Following McElreath

Statistical rethinking (first version): m13_6NC1 P405

Stan code here: Code model using non-centered parameterization.

# Non-centered varying intercepts and slopes, following McElreath's
# m13_6NC1 (Statistical Rethinking, first edition, p. 405).
mod3_3 <- stan_model("mod3_3.stan")  # use `<-`, not `=`, for assignment
fit.mod3_3 <- sampling(mod3_3, data = data.list_mod3, iter = 3000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14, adapt_delta = 0.99))
print(fit.mod3_3, pars = c("beta_age", "beta_age2",
                           "alpha", "z_alpha_block", "sigma_block",
                           "Rho_prov", "sigma_prov", "z_alpha_prov", "z_beta_prov",
                           "v_prov",
                           "sigma_y"), probs = c(0.10, 0.5, 0.9))
# Posterior predictive check: observed heights vs 50 replicated datasets.
y_rep <- as.matrix(fit.mod3_3, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace and pairs plots with NUTS diagnostics overlaid.
posterior_cp <- as.array(fit.mod3_3)
np_cp <- nuts_params(fit.mod3_3)
mcmc_trace(posterior_cp, pars = c("alpha", "sigma_prov[1]"), np = np_cp) +
  xlab("Post-warmup iteration")
mcmc_pairs(posterior_cp, np = np_cp,
           pars = c("alpha", "beta_age", "beta_age2", "sigma_y",
                    "sigma_prov[1]", "sigma_prov[2]"),
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size = 3, div_shape = 19))

Comment: I tried \(\alpha \sim \mathcal{N}(0,1)\), chains didn’t mix. Very high R-hat values and lots of divergent transitions.

5.2.2 Following Sorensen

Sorensen et al. 2016. Listing 8.

# Non-centered varying slopes following Sorensen et al. 2016, Listing 8.
# Data list is identical in content to data.list_mod3; rebuilt here so the
# section is self-contained.
data.list_mod3_4 <- list(N = length(data$height),             # Number of observations
                         y = data$height,                     # Response variable (raw height)
                         age = data$age.sc,                   # Tree age (scaled)
                         nprov = length(unique(data$prov)),   # Number of provenances
                         nblock = length(unique(data$block)), # Number of blocks
                         prov = as.numeric(data$prov),        # Provenance index
                         bloc = as.numeric(data$block))       # Block index
mod3_4 <- stan_model("mod3_4.stan")  # use `<-`, not `=`, for assignment
fit.mod3_4 <- sampling(mod3_4, data = data.list_mod3_4, iter = 2000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14, adapt_delta = 0.999))
print(fit.mod3_4, probs = c(0.10, 0.5, 0.9))
# Posterior predictive check: observed heights vs 50 replicated datasets.
y_rep <- as.matrix(fit.mod3_4, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep = y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

Comment: \(\alpha\) has very different values between the McElreath and Sorensen models.